#setwd("/Users/javier/Documents/Jupyter/BMI_6106_2023/HomeWorks/HW4")
expectancy = read.csv("Life_Expectancy_Data.csv")
#mean imputation
for(i in 1:ncol(expectancy)) {
expectancy[ , i][is.na(expectancy[ , i])] <- median(expectancy[ , i], na.rm=TRUE)
}
#Data Exploration
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(faraway)
#explore summary statistics
summary(expectancy)
## Adult.Mortality infant.deaths Alcohol percentage.expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.010 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 1.093 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.755 Median : 64.913
## Mean :164.7 Mean : 30.3 Mean : 4.547 Mean : 738.251
## 3rd Qu.:227.0 3rd Qu.: 22.0 3rd Qu.: 7.390 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.870 Max. :19479.912
## Hepatitis.B Measles BMI under.five.deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:82.00 1st Qu.: 0.0 1st Qu.:19.40 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :83.02 Mean : 2419.6 Mean :38.38 Mean : 42.04
## 3rd Qu.:96.00 3rd Qu.: 360.2 3rd Qu.:56.10 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## Polio Total.expenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.370 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.62 Mean : 5.924 Mean :82.39 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.330 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## GDP Population Income.composition.of.resources
## Min. : 1.68 Min. :3.400e+01 Min. :0.0000
## 1st Qu.: 580.49 1st Qu.:4.189e+05 1st Qu.:0.5042
## Median : 1766.95 Median :1.387e+06 Median :0.6770
## Mean : 6611.52 Mean :1.023e+07 Mean :0.6304
## 3rd Qu.: 4779.41 3rd Qu.:4.584e+06 3rd Qu.:0.7720
## Max. :119172.74 Max. :1.294e+09 Max. :0.9480
## Schooling Life.expectancy
## Min. : 0.00 Min. :36.30
## 1st Qu.:10.30 1st Qu.:63.20
## Median :12.30 Median :72.10
## Mean :12.01 Mean :69.23
## 3rd Qu.:14.10 3rd Qu.:75.60
## Max. :20.70 Max. :89.00
#check NA
colSums(is.na(expectancy))
## Adult.Mortality infant.deaths
## 0 0
## Alcohol percentage.expenditure
## 0 0
## Hepatitis.B Measles
## 0 0
## BMI under.five.deaths
## 0 0
## Polio Total.expenditure
## 0 0
## Diphtheria HIV.AIDS
## 0 0
## GDP Population
## 0 0
## Income.composition.of.resources Schooling
## 0 0
## Life.expectancy
## 0
#check distribution of each variable - check for normailty and skewness
#reshape data into long format
df_long <- expectancy %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
#plot distributions
ggplot(df_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal()
#these plots show that many of the variables don't have a normal distribution, we will need to do some transformations on them. Later on in this problem set I will do transformations, but I wanted to continue on with the raw data and see what I find with that first. Then I do transformations on the variables and continue to find the best model.
#Use VIF to check for collinearity
#check vif without the dependent variable (Life.expectancy)
exp_wo_dep = expectancy[,-17]
#check the columns we have to make sure we took out the correct variable, commenting it out for my final code
#colnames(exp_wo_dep)
#check for VIF above 10
vif(exp_wo_dep)
## Adult.Mortality infant.deaths
## 1.714540 175.771885
## Alcohol percentage.expenditure
## 1.505774 5.736699
## Hepatitis.B Measles
## 1.303353 1.372957
## BMI under.five.deaths
## 1.510797 175.582543
## Polio Total.expenditure
## 1.936480 1.171291
## Diphtheria HIV.AIDS
## 2.154278 1.408441
## GDP Population
## 5.962848 1.488814
## Income.composition.of.resources Schooling
## 2.977155 3.294586
#this is a correlation plot that my knitr could not render
#correlation_plot <-cor(expectancy)
#corrplot(correlation_plot, type="upper", order="hclust", col=brewer.pal(n=8, name="RdYlBu"))
Based on our data exploration, I found the variables infant.deaths and under.five.deaths have a VIF > 10, meaning we will need to take them out. I also can see from the distribution that we will need to do some transformations on the data. Later on in this problem set I will do transformations, but I want to continue on with the raw data and see what I find with that first. Then I do transformations on the variables and continue to find the best model.
#lm with all data
colnames(expectancy)
## [1] "Adult.Mortality" "infant.deaths"
## [3] "Alcohol" "percentage.expenditure"
## [5] "Hepatitis.B" "Measles"
## [7] "BMI" "under.five.deaths"
## [9] "Polio" "Total.expenditure"
## [11] "Diphtheria" "HIV.AIDS"
## [13] "GDP" "Population"
## [15] "Income.composition.of.resources" "Schooling"
## [17] "Life.expectancy"
all = lm(Life.expectancy ~ .,data = expectancy)
summary(all)
##
## Call:
## lm(formula = Life.expectancy ~ ., data = expectancy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.7726 -2.1905 -0.0432 2.3819 16.6918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.393e+01 5.356e-01 100.677 < 2e-16 ***
## Adult.Mortality -2.047e-02 7.945e-04 -25.771 < 2e-16 ***
## infant.deaths 9.760e-02 8.464e-03 11.530 < 2e-16 ***
## Alcohol 1.376e-01 2.356e-02 5.843 5.69e-09 ***
## percentage.expenditure 8.313e-05 9.071e-05 0.916 0.359506
## Hepatitis.B -1.583e-02 3.738e-03 -4.236 2.35e-05 ***
## Measles -1.925e-05 7.693e-06 -2.502 0.012399 *
## BMI 4.940e-02 4.642e-03 10.642 < 2e-16 ***
## under.five.deaths -7.377e-02 6.218e-03 -11.864 < 2e-16 ***
## Polio 2.842e-02 4.484e-03 6.338 2.69e-10 ***
## Total.expenditure 1.048e-01 3.394e-02 3.087 0.002040 **
## Diphtheria 4.034e-02 4.671e-03 8.636 < 2e-16 ***
## HIV.AIDS -4.774e-01 1.760e-02 -27.133 < 2e-16 ***
## GDP 4.694e-05 1.383e-05 3.395 0.000695 ***
## Population -1.884e-10 1.701e-09 -0.111 0.911802
## Income.composition.of.resources 5.921e+00 6.333e-01 9.350 < 2e-16 ***
## Schooling 6.744e-01 4.185e-02 16.114 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.08 on 2921 degrees of freedom
## Multiple R-squared: 0.8169, Adjusted R-squared: 0.8159
## F-statistic: 814.5 on 16 and 2921 DF, p-value: < 2.2e-16
#We can see that the normality and residual plot are not great from this model
plot(all)
#we need to take out some variables
#start by taking out the VIF values greater than 10 (take out infant.deaths, under.five.deaths)
lm1 = lm(Life.expectancy ~ Adult.Mortality + Alcohol + Hepatitis.B + BMI + Polio +
Diphtheria + GDP + Income.composition.of.resources + percentage.expenditure +
Measles + Total.expenditure + HIV.AIDS + Population + Schooling, data = expectancy)
#I am commenting out the summary and plot of this model to make my project more concise. If interest you can remove the # mark and run them.
#summary(lm1)
#There is some improvement but it is still not as good as we want it
#plot(lm1)
#taking out variables that had a higher p-value than .05 from summary(1m1) (they are the same as summary(all)), the variables with p-values higher than .05 is percantage.expenditure and Population
lm2 = lm(Life.expectancy ~ Adult.Mortality + Alcohol + Hepatitis.B + BMI + Polio +
Diphtheria + GDP + Income.composition.of.resources +
Measles + Total.expenditure + HIV.AIDS + Schooling, data = expectancy)
summary(lm2)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + Alcohol + Hepatitis.B +
## BMI + Polio + Diphtheria + GDP + Income.composition.of.resources +
## Measles + Total.expenditure + HIV.AIDS + Schooling, data = expectancy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.8752 -2.2305 -0.0429 2.4080 17.4575
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.265e+01 5.316e-01 99.039 < 2e-16 ***
## Adult.Mortality -2.096e-02 8.128e-04 -25.794 < 2e-16 ***
## Alcohol 9.964e-02 2.360e-02 4.223 2.49e-05 ***
## Hepatitis.B -1.828e-02 3.781e-03 -4.835 1.40e-06 ***
## BMI 5.032e-02 4.721e-03 10.661 < 2e-16 ***
## Polio 3.176e-02 4.581e-03 6.932 5.08e-12 ***
## Diphtheria 4.778e-02 4.738e-03 10.083 < 2e-16 ***
## GDP 5.402e-05 6.611e-06 8.171 4.49e-16 ***
## Income.composition.of.resources 6.472e+00 6.449e-01 10.036 < 2e-16 ***
## Measles -3.594e-05 6.905e-06 -5.204 2.08e-07 ***
## Total.expenditure 1.146e-01 3.432e-02 3.338 0.000853 ***
## HIV.AIDS -4.863e-01 1.799e-02 -27.026 < 2e-16 ***
## Schooling 7.018e-01 4.272e-02 16.427 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.18 on 2925 degrees of freedom
## Multiple R-squared: 0.8076, Adjusted R-squared: 0.8068
## F-statistic: 1023 on 12 and 2925 DF, p-value: < 2.2e-16
#We have gotten more improvement but we are going to have to do variable transformations in order to get better results
plot(lm2)
I first started will all the variables to see what the initial model
looks like, then I took out the two variables that had a VIF > 10,
then I took out the variables that were not signficant in the model.
These model are not very good, in order to improve them we will need to
do some transformations on different variables. We do those
transformations here and then we continue to find the best model.
#Transform variables that are skewed and retest these out.
exp_transform = expectancy
#Taking out Infant.deaths and under.five.deaths becuase their VIF was above 10
exp_transform <- exp_transform[, !names(exp_transform) %in% c("infant.deaths","under.five.deaths")]
#Right skewed variables:
#mild: Adult.Mortalit, Alcohol -> sqrt transformation
#extreme: GDP, HIV.AIDS, Measles, percentage.expenditure, Population -> log transformation
exp_transform$Adult.Mortality = sqrt(exp_transform$Adult.Mortality)
exp_transform$Alcohol = sqrt(exp_transform$Alcohol)
exp_transform$GDP = log(exp_transform$GDP)
exp_transform$HIV.AIDS <- log(exp_transform$HIV.AIDS)
exp_transform$Measles <- log(exp_transform$Measles)
exp_transform$percentage.expenditure <- log(exp_transform$percentage.expenditure)
exp_transform$Population <- log(exp_transform$Population)
#remove NA values
exp_transform$percentage.expenditure[is.infinite(exp_transform$percentage.expenditure)] <- NA
exp_transform$Measles[is.infinite(exp_transform$Measles)] <- NA
exp_transform <- na.omit(exp_transform)
#Left skewed variables:
#Income.composition.of.resources, Diphtheria, Polio -> x^2 or x^3 transformation
exp_transform$Income.composition.of.resources <- exp_transform$Income.composition.of.resources^2
exp_transform$Diphtheria <- exp_transform$Diphtheria^3
exp_transform$Polio <- exp_transform$Polio^3
#check distribution of each transformered variable - check for normailty and skewness
#reshape data into long format
df_long <- exp_transform %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
#plot distributions
ggplot(df_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.5) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal()
lm_transform <- lm(Life.expectancy ~ ., data = exp_transform)
vif(lm_transform)
## Adult.Mortality Alcohol
## 1.569374 1.756195
## percentage.expenditure Hepatitis.B
## 11.705395 1.245133
## Measles BMI
## 1.310495 1.952856
## Polio Total.expenditure
## 5.507489 1.248578
## Diphtheria HIV.AIDS
## 5.676954 2.403105
## GDP Population
## 11.728688 1.079916
## Income.composition.of.resources Schooling
## 7.315676 5.862327
#Based on our new transformations, we can see that percentage.expenditure and GDP have a VIF of over 10, so we will not use both of these variavles
summary(lm_transform)
##
## Call:
## lm(formula = Life.expectancy ~ ., data = exp_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4210 -1.8394 -0.1346 1.8248 12.8967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.904e+01 9.770e-01 60.427 < 2e-16 ***
## Adult.Mortality -1.948e-01 2.033e-02 -9.582 < 2e-16 ***
## Alcohol -2.206e-01 1.025e-01 -2.152 0.03158 *
## percentage.expenditure 7.006e-01 1.339e-01 5.234 1.88e-07 ***
## Hepatitis.B -1.907e-02 4.186e-03 -4.557 5.61e-06 ***
## Measles -5.824e-02 3.450e-02 -1.688 0.09162 .
## BMI 5.447e-03 6.066e-03 0.898 0.36934
## Polio 1.037e-06 6.556e-07 1.582 0.11393
## Total.expenditure 2.570e-03 3.874e-02 0.066 0.94711
## Diphtheria 1.827e-06 6.640e-07 2.751 0.00601 **
## HIV.AIDS -2.548e+00 7.565e-02 -33.679 < 2e-16 ***
## GDP -4.421e-01 1.495e-01 -2.957 0.00315 **
## Population -8.967e-02 3.482e-02 -2.575 0.01011 *
## Income.composition.of.resources 1.574e+01 9.719e-01 16.192 < 2e-16 ***
## Schooling 2.811e-01 5.888e-02 4.774 1.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.415 on 1536 degrees of freedom
## Multiple R-squared: 0.8962, Adjusted R-squared: 0.8952
## F-statistic: 947 on 14 and 1536 DF, p-value: < 2.2e-16
#Based on the new transformations, we find that the variables Measles, Total.expenditure, GDP, and Population are not significant. We will take these out of our next model.
lm3 = lm(Life.expectancy ~ Adult.Mortality + Alcohol + Hepatitis.B + BMI + Polio + Diphtheria + Income.composition.of.resources + percentage.expenditure + Schooling, data = exp_transform)
summary(lm3)
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + Alcohol + Hepatitis.B +
## BMI + Polio + Diphtheria + Income.composition.of.resources +
## percentage.expenditure + Schooling, data = exp_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.8379 -2.4997 0.1005 2.7708 19.9504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.698e+01 7.654e-01 74.450 < 2e-16 ***
## Adult.Mortality -4.296e-01 2.515e-02 -17.079 < 2e-16 ***
## Alcohol -1.143e+00 1.277e-01 -8.948 < 2e-16 ***
## Hepatitis.B -2.429e-02 5.493e-03 -4.422 1.04e-05 ***
## BMI 4.191e-02 7.738e-03 5.417 7.03e-08 ***
## Polio 2.278e-06 8.632e-07 2.639 0.00840 **
## Diphtheria 3.544e-06 8.726e-07 4.062 5.11e-05 ***
## Income.composition.of.resources 2.364e+01 1.223e+00 19.339 < 2e-16 ***
## percentage.expenditure 2.032e-01 7.408e-02 2.743 0.00616 **
## Schooling 3.268e-01 7.604e-02 4.298 1.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.508 on 1541 degrees of freedom
## Multiple R-squared: 0.8185, Adjusted R-squared: 0.8174
## F-statistic: 772.1 on 9 and 1541 DF, p-value: < 2.2e-16
plot(lm3)
#This is the best we have seen the residual and normality plot. The residuals are decently good showing no pattern in the plot. The normality plot is decent, based on the tails there are probably outliers that are having an effect on the normality.
res = resid(lm3)
#Create density plot of residuals
plot(density(res))
This is the best we have seen the residual and normality plot. The
residuals are decently good showing no pattern in the plot. The
normality plot is decent, based on the tails there are probably outliers
that are having an effect on the normality. We removed transformed
variables that had a VIF > 10 and then were not signficant in the
model.